Summary

  • Reading in data in R
  • Heatmaps
    • Choice of the distance metric matters
    • Clustering genes
    • The heatmap has an option “scale” by default
    • Label sample types
    • The choice of the linkage method in clustering matters
    • heatmap.2()

http://sebastianraschka.com/Articles/heatmaps_in_r.html


1. Reading in data

In this section we’ll continue using mpg and CRC dataset.

library(Biobase)

################################################################
# Read data
################################################################
CRC <- read.csv("./data/CRC_train.csv")
CRC.prot <- CRC[,1:72]
CRC.anno <- CRC[,73:79]

# Deal with missing value
# First option: remove the samples with missing values
dim(na.omit(CRC.prot))
## [1] 89 72
# Second option: impute the missing values
random.imp <- function (a){
  missing <- is.na(a)
  n.missing <- sum(missing)
  a.obs <- a[!missing]
  imputed <- a
  # imputed[missing] <- sample(a.obs, n.missing, replace=TRUE)
  imputed[missing] <- median(a.obs)
  # imputed[missing] <- 0
  return (imputed)
}
pMiss <- function(x){sum(is.na(x))/length(x)*100}
samplemissing <- apply(CRC.prot,1,pMiss)
# Only keep the samples with less than 5% missing values
selectedSamples <- which(samplemissing <= 5) 
imputed.CRC.prot <- t(apply(CRC.prot[selectedSamples,], 1, function(x) random.imp(x)))
imputed.CRC.anno <- CRC.anno[selectedSamples,]

2. Heatmaps

2.1 Correlation heatmap

library(ggplot2)
library(tidyr)
library(RColorBrewer)

CRC.biomarkers <- CRC[,c("CP", "PON1", "SERPINA3", "LRG1", "TIMP1")]
cor.biomarker <- cor(CRC.biomarkers, use = "pairwise.complete.obs")
cor.biomarker <- as.data.frame(cor.biomarker)
cor.biomarker$Protein1 <- rownames(cor.biomarker)
cor.biomarker <- cor.biomarker %>% gather(Protein2, Correlation, -Protein1)
g <- ggplot(cor.biomarker, aes(Protein1, Protein2))
g + geom_tile(aes(fill=Correlation)) + scale_fill_gradientn(colours = brewer.pal(5,"Spectral"))

################################################################
################################################################
####################   H E A T M A P S  ########################
#           Iconic graphics for high-throughput data
################## CAUTION: HIDDEN PITFALLS ####################
################################################################
################################################################

################################################################
# Heatmap of individual values of proteins
################################################################
heatmap(t(imputed.CRC.prot))

?heatmap
# Change the font of row and column label
heatmap(t(imputed.CRC.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2))

# Don't do cluster on rows
heatmap(t(imputed.CRC.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Rowv = NA)

# Don't do cluster on columns
heatmap(t(imputed.CRC.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Colv = NA)

Note 2.2: Choice of the distance metric matters

library(bioDist)

# toy example: 3 genes, 5 samples
gene1 <- c(1, 6, 2, 4, 7)
gene2 <- gene1 + 4 
gene3 <- gene2/3 + c(0, 2, 0, 4, 0)
e <- rbind(gene1, gene2, gene3)
dimnames(e) <- list(paste("gene", 1:3, sep=""), paste("sample", 1:5, sep=""))
e
##        sample1   sample2 sample3  sample4   sample5
## gene1 1.000000  6.000000       2 4.000000  7.000000
## gene2 5.000000 10.000000       6 8.000000 11.000000
## gene3 1.666667  5.333333       2 6.666667  3.666667
# plot
e1 <- as.data.frame(e)
e1$gene <- rownames(e1)
e1 <- e1 %>% gather(Sample, Abundance, -gene)
ggplot(e1) + 
  geom_line(aes(x=Sample, y = Abundance, group = gene, colour=gene))

# default: distance fnction eucledian
heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian distance")

# correlation
heatmap(e, distfun=cor.dist, margins=c(10,10), main="Correlation distance")

Note 2.3: Clustering genes

genes are multivariate objects (=observations) in the space of samples (=variables) if we are not interested in similarity of abundance but in similarity of the direction of change, then it will help to standardize the observations across variables

now each gene has mean 0 and sample variance 1 across arrays

library(genefilter)
e_centerScale <- ( e - rowMeans(e) ) / rowSds(e)

e1 <- as.data.frame(e_centerScale)
e1$gene <- rownames(e1)
e1 <- e1 %>% gather(Sample, Abundance, -gene)
ggplot(e1) + 
  geom_line(aes(x=Sample, y = Abundance, group = gene, colour=gene))

# Eucledian distance is sensitive to centering and scaling
heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian, raw")

heatmap(e_centerScale, distfun=euc, margins=c(10,10), main="Eucledian, centered and scaled")

# Correlation distance is not sensitive to centering and scaling
heatmap(e, distfun=cor.dist, margins=c(10,10), main="Correlation, raw")

heatmap(e_centerScale, distfun=cor.dist, margins=c(10,10), main="Correlation, centered and scaled")

Note that centering and scaling of genes across samples + correlation distance affect the order of the samples). My choice is to use correlation on the original values, or to compute dendrograms separately, save the order of genes or samples, and use “image” to visualize

Note 2.4: The heatmap has an option “scale” by default

However it only affects the color, and not the calculation of dendrograms. It’s usually better to make custom color definitions

# scaling of color matters for unscaled data
heatmap(e, distfun=euc, margins=c(10,10), scale="none", main="Eucledian, raw")

heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian, scaled")

# Conclusion: genes 1 and 2 are far from each other on the dendrogram (because of Eucledian distance) but have same color patterns of intensities (because of scaled colors)


# scaling of color irrelevant if data themselves are scaled already
heatmap(e_centerScale, distfun=euc, margins=c(10,10), scale="none")

heatmap(e_centerScale, distfun=euc, margins=c(10,10))

Example with real dataset

# Default: raw data, scaled by row (i.e. gene)
heatmap(t(imputed.CRC.prot))

col<- colorRampPalette(c("red", "white", "blue"))(256)
heatmap(t(imputed.CRC.prot), col =  col)

# change color code: non-standardized expressions
# equal-spaced breaks for values of abundance
summary(as.vector(t(imputed.CRC.prot))) # min ~ 6, max ~ 25
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.583  10.496  13.086  13.535  16.058  24.591
quantile.range <- quantile(imputed.CRC.prot, probs = seq(0, 1, 0.01))
palette.breaks <- seq(quantile.range["5%"], quantile.range["95%"], by = 0.5)
color.palette  <- colorRampPalette(brewer.pal(11,"Spectral"))(length(palette.breaks) - 1)
heatmap(t(imputed.CRC.prot), col = color.palette, scale = "none", breaks= palette.breaks)

# change color code: protein-standardized expressions
# quantile-spaced breaks in positive and negative directions
imputed_CRC_prot_centerScale <-  ( t(imputed.CRC.prot) - rowMeans(t(imputed.CRC.prot)) ) / rowSds(t(imputed.CRC.prot))

exprs_brk <- c( quantile(imputed_CRC_prot_centerScale[imputed_CRC_prot_centerScale < 0], probs=seq(0, 1, length=10)), 0, quantile(imputed_CRC_prot_centerScale[imputed_CRC_prot_centerScale > 0], seq(0, 1, length=10)))
colors <- colorRampPalette(brewer.pal(11,"Spectral"))(length(exprs_brk)-1)
heatmap(t(imputed.CRC.prot), col = colors, breaks = exprs_brk)

Note 2.5: Label sample types

myColor <- rep("blue", nrow(imputed.CRC.prot))
myColor[imputed.CRC.anno$Sub_group == "CRC"] <- "red" 
myColor[imputed.CRC.anno$Sub_group == "Healthy"] <- "yellow" 
heatmap(t(imputed.CRC.prot), col = colors, breaks = exprs_brk, ColSideColors=myColor)

Note 2.5: The choice of the linkage method in clustering matters

Distance options: euclidean (default), maximum, canberra, binary, minkowski, manhattan

Cluster options: complete (default), single, average, mcquitty, median, centroid, ward

# Correlation distance
heatmap(t(imputed.CRC.prot),
        distfun = cor.dist,
        col = colors, 
        breaks = exprs_brk, 
        ColSideColors=myColor,
        Rowv = NA) 

# Euclidean distance
# method "complete" by default
heatmap(t(imputed.CRC.prot),
        distfun = euc,
        col = colors, 
        breaks = exprs_brk, 
        ColSideColors= myColor,
        Rowv = NA) 

# clusters in the sample space: note the use of (t)
# single linkage
hc <- hclust(dist(imputed.CRC.prot), method="single" )
heatmap(t(imputed.CRC.prot),
        col= colors, 
        breaks=exprs_brk, 
        ColSideColors=myColor,
        Rowv = NA, 
        Colv = as.dendrogram(hc)) # apply default clustering method

# Average 
hc <- hclust(dist(imputed.CRC.prot), method="average" )
heatmap(t(imputed.CRC.prot),
        col = colors, 
        breaks = exprs_brk, 
        ColSideColors = myColor,
        Rowv = NA, 
        Colv = as.dendrogram(hc)) # apply default clustering method

# Try manhattan distance
manhattan_dis <- dist(imputed.CRC.prot, method = "manhattan")
hc <- hclust(manhattan_dis)
heatmap(t(imputed.CRC.prot),
        col = colors, 
        breaks = exprs_brk, 
        ColSideColors = myColor,
        Rowv = NA, 
        Colv = as.dendrogram(hc)) # apply default clustering method

# Try maximum distance
maximum_dis <- dist(imputed.CRC.prot, method = "maximum")
hc <- hclust(maximum_dis)
heatmap(t(imputed.CRC.prot),
        col = colors, 
        breaks = exprs_brk, 
        ColSideColors = myColor,
        Rowv = NA, 
        Colv = as.dendrogram(hc)) # apply default clustering method

Note 2.7: heatmap.2()

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
heatmap.2(t(imputed.CRC.prot), srtCol=45)